{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "import numpy as np\n",
    "from copy import deepcopy\n",
    "import pandas as pd\n",
    "\n",
    "from deeppavlov.core.commands.train import read_data_by_config, train_evaluate_model_from_config\n",
    "from deeppavlov.core.commands.infer import interact_model, build_model\n",
    "from deeppavlov.core.commands.utils import expand_path, parse_config\n",
    "from deeppavlov.core.common.params import from_params\n",
    "from deeppavlov.core.common.errors import ConfigError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read unlabelled data for label propagation\n",
    "def read_unlabelled_data(UNLABELLED_DATA_PATH):\n",
    "    with open(UNLABELLED_DATA_PATH, \"r\") as f:\n",
    "        unlabelled_data = f.read().splitlines()\n",
    "    unlabelled_data = [x for x in unlabelled_data if x != '']\n",
    "    return unlabelled_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def make_pl_config(CONFIG_PATH):\n",
    "    config_path_pl = Path(CONFIG_PATH).parent / Path(Path(CONFIG_PATH).stem + \"_pl.json\")\n",
    "\n",
    "    with open(CONFIG_PATH, \"r\") as f:\n",
    "        config = json.load(f)\n",
    "    \n",
    "    config_pl = deepcopy(config)\n",
    "    config_pl[\"dataset_reader\"][\"train\"] = Path(config_pl[\"dataset_reader\"].get(\"train\", \"train.csv\")).stem + \"_pl.csv\"\n",
    "    \n",
    "    with open(config_path_pl, \"w\") as f:\n",
    "        json.dump(config_pl, f, indent=2)\n",
    "    \n",
    "    return config, config_pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_extended_data(config, samples, labels, new_config = None):\n",
    "    train_data = read_data_by_config(deepcopy(config))\n",
    "    \n",
    "    for i in range(len(samples)):\n",
    "        train_data[\"train\"].append((samples[i], labels[i]))\n",
    "    df = pd.DataFrame(train_data[\"train\"], \n",
    "                      columns=[config[\"dataset_reader\"][\"x\"], \n",
    "                               config[\"dataset_reader\"][\"y\"]])\n",
    "    df[config[\"dataset_reader\"][\"y\"]] = df[config[\"dataset_reader\"][\"y\"]].apply(\n",
    "        lambda x: config[\"dataset_reader\"].get(\"class_sep\", \",\").join(x))\n",
    "    \n",
    "    if new_config is not None:\n",
    "        config = new_config\n",
    "    file = expand_path(Path(config[\"dataset_reader\"][\"data_path\"]) / \n",
    "                       Path(config[\"dataset_reader\"][\"train\"]))\n",
    "\n",
    "    if config[\"dataset_reader\"].get(\"format\", \"csv\") == \"csv\":\n",
    "        keys = ('sep', 'header', 'names')\n",
    "        df.to_csv(file, \n",
    "                  index=False,\n",
    "                  sep=config[\"dataset_reader\"].get(\"sep\", \",\")\n",
    "                 )\n",
    "    elif config[\"dataset_reader\"].get(\"format\", \"csv\") == \"json\":\n",
    "        keys = ('orient', 'lines')\n",
    "        df.to_json(file, \n",
    "                  index=False,\n",
    "                  orient=config[\"dataset_reader\"].get(\"orient\", None),\n",
    "                  lines=config[\"dataset_reader\"].get(\"lines\", False)\n",
    "                  )\n",
    "    else:\n",
    "        raise ConfigError(\"Can not work with current data format\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# manually given parameters for pseudo-labeling\n",
    "\n",
    "# path to config file\n",
    "CONFIG_PATH = \"../deeppavlov/configs/classifiers/convers_vs_info.json\"\n",
    "# read config, compose new one, save it\n",
    "config, config_pl = make_pl_config(CONFIG_PATH)\n",
    "config, config_pl = parse_config(config), parse_config(config_pl)\n",
    "config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path to file with unlabelled data\n",
    "UNLABELLED_DATA_PATH = expand_path(Path(config[\"dataset_reader\"][\"data_path\"])) / Path(\"question_L6.txt\")\n",
    "# number of samples that are going to be labelled during one iteration of label propagation\n",
    "ONE_ITERATION_PORTION = 100\n",
    "# number of iterations\n",
    "N_ITERATIONS = 10\n",
    "CLASSES_VOCAB_ID_IN_PIPE = 0\n",
    "CONFIDENT_PROBA = 0.9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read unlabelled dataset\n",
    "unlabelled_data = read_unlabelled_data(UNLABELLED_DATA_PATH)\n",
    "\n",
    "# save initial dataset as extended\n",
    "save_extended_data(config, [], [], new_config=config_pl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "available_unlabelled_ids = np.arange(len(unlabelled_data))\n",
    "\n",
    "np.random.seed(42)\n",
    "\n",
    "for i in range(N_ITERATIONS):\n",
    "    samples = []\n",
    "    labels = []\n",
    "    \n",
    "    ids_to_label = available_unlabelled_ids[\n",
    "        np.random.randint(low=0, \n",
    "                          high=len(available_unlabelled_ids), \n",
    "                          size=ONE_ITERATION_PORTION)]\n",
    "    available_unlabelled_ids = np.delete(available_unlabelled_ids, ids_to_label)\n",
    "    train_evaluate_model_from_config(deepcopy(config_pl))\n",
    "    model = build_model(deepcopy(config_pl))\n",
    "    classes = np.array(list(from_params(\n",
    "        deepcopy(config_pl[\"chainer\"][\"pipe\"][CLASSES_VOCAB_ID_IN_PIPE])).keys()))\n",
    "\n",
    "    for j, sample_id in enumerate(ids_to_label):\n",
    "        prediction = model([unlabelled_data[sample_id]])[0]\n",
    "        if len(np.where(np.array(prediction) > CONFIDENT_PROBA)[0]):\n",
    "            samples.append(unlabelled_data[sample_id])\n",
    "            labels.append(classes[np.where(np.array(prediction) > CONFIDENT_PROBA)])\n",
    "    \n",
    "    print(\"Iteration {}: add {} samples to train dataset\".format(i, len(samples)))\n",
    "    save_extended_data(config_pl, samples, labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "accelerator": "GPU",
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
